#!/usr/bin/env bash
#
# Invoke `vila-infer` once with a text prompt and a single media file.
#
# Every argument defaults to the value that used to be hard-coded here, so
# running this script with none of the variables below set issues the same
# command as before. Any of them can be overridden per run, e.g.:
#
#   VILA_MEDIA=/path/to/other_page.jpg VILA_TEXT="Free OCR." sh <this-script>
#
# Environment overrides (an empty value falls back to the default):
#   VILA_MODEL_PATH  value passed to --model-path
#   VILA_CONV_MODE   value passed to --conv-mode
#   VILA_TEXT        value passed to --text
#   VILA_MEDIA       value passed to --media
#
# Exit status: that of `vila-infer` (it is the last command executed).

# Abort on unexpected failures and on references to unset variables.
# `pipefail` is deliberately omitted: there is no pipeline, and leaving it out
# keeps the script runnable under a plain POSIX `sh`.
set -eu

model_path=${VILA_MODEL_PATH:-/lustre/hdd/LAS/wzhang-lab/mingl/code/vllm/vlm_ocr/VILA/runs/train/ocr-qwen2-vl-8b-pretrain-sam_clip/model}
conv_mode=${VILA_CONV_MODE:-auto}
prompt_text=${VILA_TEXT:-Free OCR.}
media_path=${VILA_MEDIA:-/lustre/hdd/LAS/wzhang-lab/mingl/code/vllm/vlm_ocr/workspace/data/OmniDocBench/images/jiaocaineedrop_jiaocai_needrop_en_3146.jpg}

# Each expansion is double-quoted so that values containing spaces (such as
# the default prompt) reach vila-infer as a single argument.
vila-infer \
    --model-path "${model_path}" \
    --conv-mode "${conv_mode}" \
    --text "${prompt_text}" \
    --media "${media_path}"